{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TASK 1 - True predictions for 2021\n",
    "\n",
    "\n",
    "import sys\n",
    "import os\n",
    "import logging\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from joblib import dump, load\n",
    "\n",
    "import views\n",
    "from views import Ensemble, Model, Downsampling, Period\n",
    "from views.utils.data import assign_into_df\n",
    "from views.apps.transforms import lib as translib\n",
    "from views.apps.evaluation import lib as evallib, feature_importance as fi\n",
    "from views.apps.model import api\n",
    "from views.apps.extras import extras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up the \"cm_africa_imp_0\" entry in views.DATASETS and keep its dataframe\n",
    "# as df, the working frame used by every later cell.\n",
    "# NOTE(review): the name suggests country-month data for Africa with\n",
    "# imputed values -- confirm against the views dataset definitions.\n",
    "dataset = views.DATASETS[\"cm_africa_imp_0\"]\n",
    "df = dataset.df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the external wiki CSV into df on (month_id, country_id).\n",
    "# The CSV location can be overridden with the WIKI_DATA_CSV environment\n",
    "# variable; the default is the original author's local path.\n",
    "WIKI_DATA_CSV = os.environ.get(\n",
    "    'WIKI_DATA_CSV',\n",
    "    r'/home/default/Dokumente/TRINITY/HT2020/comp/prediction-project/data/Wiki_Data_Final_2021.csv',\n",
    ")\n",
    "wiki = pd.read_csv(WIKI_DATA_CSV)\n",
    "multi = wiki.set_index(['month_id', 'country_id']).sort_index(level=0)\n",
    "\n",
    "# Only bring in columns df does not already have, so re-running this cell\n",
    "# is a no-op instead of producing suffixed *_x / *_y duplicates.\n",
    "wiki_cols = [col for col in multi.columns if col not in df.columns]\n",
    "if wiki_cols:\n",
    "    df = df.merge(multi[wiki_cols], on=['month_id', 'country_id'], how='left')\n",
    "\n",
    "# Rows without a match in the CSV get NaN from the left join; this replaces\n",
    "# every NaN in df (not only those in the merged columns) with 0.\n",
    "df = df.fillna(0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The periods are collected in a list so that further ones can be added\n",
    "# once updated data becomes available.\n",
    "\n",
    "# Calibration period.\n",
    "period_calib = api.Period(\n",
    "    name=\"calib\",\n",
    "    train_start=121,    # 1990-01\n",
    "    train_end=444,      # 2016-12\n",
    "    predict_start=445,  # 2017-01\n",
    "    predict_end=488,    # 2020-08\n",
    ")\n",
    "\n",
    "# Test period.\n",
    "period_test = api.Period(\n",
    "    name=\"test\",\n",
    "    train_start=121,    # 1990-01\n",
    "    train_end=488,      # 2020-08\n",
    "    predict_start=490,  # 2020-10\n",
    "    predict_end=495,    # 2021-03\n",
    ")\n",
    "\n",
    "periods = [period_calib, period_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Steps (2 through 7) to train, predict and evaluate for.\n",
    "steps = list(range(2, 8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the feature columns handed to api.Model(cols_features=...).\n",
    "# This is a plain list of column names rather than a slice of df, so it\n",
    "# holds no copy of the data and does not go stale when df changes.\n",
    "cols_features = [\n",
    "    'ged_best_sb', 'wdi_vc_btl_deth', 'reign_precip', 'reign_prev_conflict',\n",
    "    'reign_tenure_months', 'reign_irregular', 'reign_lastelection',\n",
    "    'reign_loss', 'reign_pctile_risk', 'reign_couprisk',\n",
    "    'ged_count_sb', 'ged_best_os', 'ged_count_os', 'wdi_eg_use_pcap_kg_oe',\n",
    "    'ged_best_ns', 'vdem_v2x_accountability', 'wdi_nv_srv_totl_zs',\n",
    "    'wdi_sp_pop_totl', 'wdi_sl_tlf_totl_fe_zs', 'wdi_sm_pop_totl_zs',\n",
    "    'wdi_sm_pop_refg_or', 'wdi_dt_oda_odat_pc_zs', 'ged_count_ns',\n",
    "    'reign_gov_dominant_party', 'reign_age', 'vdem_v2xpe_exlpol',\n",
    "    'wdi_ag_lnd_frst_k2', 'wdi_bg_gsr_nfsv_gd_zs', 'wdi_st_int_rcpt_xp_zs',\n",
    "    'vdem_v2x_clpol', 'vdem_v2x_genpp',\n",
    "    'viewsEnglish',\n",
    "]\n",
    "\n",
    "# Fail early, before any training starts, if a feature column is absent from df.\n",
    "missing_features = [col for col in cols_features if col not in df.columns]\n",
    "if missing_features:\n",
    "    raise KeyError(f'feature columns missing from df: {missing_features}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of trees; passed as n_estimators to the RandomForestRegressor\n",
    "# that backs the benchmark_delta model.\n",
    "n_estimators = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model specification for the views Model API: a random forest on\n",
    "# cols_features with ln_ged_best_sb as the outcome column.\n",
    "# NOTE(review): delta_outcome=True presumably makes the model predict the\n",
    "# change in the outcome rather than its level -- confirm in views.apps.model.api.\n",
    "benchmark_delta = api.Model(\n",
    "    name=\"benchmark_delta\",\n",
    "    col_outcome=\"ln_ged_best_sb\",\n",
    "    cols_features=cols_features,\n",
    "    steps=steps,\n",
    "    outcome_type=\"real\",\n",
    "    periods=periods,\n",
    "    # The split criterion is left at scikit-learn's default (mean squared\n",
    "    # error); the explicit \"mse\" spelling is rejected by scikit-learn >= 1.2.\n",
    "    estimator=RandomForestRegressor(\n",
    "        n_estimators=n_estimators,\n",
    "        n_jobs=-1,\n",
    "        random_state=42,  # fixed seed so repeated runs grow the same forest\n",
    "    ),\n",
    "    delta_outcome=True,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every model in this list is fitted and used for prediction by the\n",
    "# loops in the following cells; add further models here.\n",
    "models = [benchmark_delta]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 17min 20s, sys: 21.3 s, total: 17min 41s\n",
      "Wall time: 15min 17s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Train all models on df by calling fit_estimators on each one.\n",
    "# This is the expensive cell: the recorded run took about 15 minutes of wall time.\n",
    "# NOTE(review): presumably one estimator is fitted per period and step --\n",
    "# confirm in views.apps.model.api.\n",
    "for model in models:\n",
    "    model.fit_estimators(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every model, write its predictions into df first, then its\n",
    "# calibrated predictions (period_calib -> period_test) as well.\n",
    "# df_predictions is left holding the calibrated predictions of the last model.\n",
    "for model in models:\n",
    "    df = assign_into_df(df, model.predict(df))\n",
    "    df_predictions = model.predict_calibrated(\n",
    "        df=df, period_calib=period_calib, period_test=period_test\n",
    "    )\n",
    "    df = assign_into_df(df, df_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#df_predictions.to_csv('pred.csv')  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>month_id</th>\n",
       "      <th>country_id</th>\n",
       "      <th>ged_best_sb</th>\n",
       "      <th>ln_ged_best_sb</th>\n",
       "      <th>d_ln_ged_best_sb</th>\n",
       "      <th>benchmark</th>\n",
       "      <th>oswald</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>490</td>\n",
       "      <td>40</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>490</td>\n",
       "      <td>41</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.442115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>490</td>\n",
       "      <td>42</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.003466</td>\n",
       "      <td>0.003466</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>490</td>\n",
       "      <td>43</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>490</td>\n",
       "      <td>47</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-2.302585</td>\n",
       "      <td>0.275300</td>\n",
       "      <td>0.039006</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   month_id  country_id  ged_best_sb  ln_ged_best_sb  d_ln_ged_best_sb  \\\n",
       "0       490          40          0.0             0.0          0.000000   \n",
       "1       490          41          0.0             0.0          0.000000   \n",
       "2       490          42          0.0             0.0          0.000000   \n",
       "3       490          43          0.0             0.0          0.000000   \n",
       "4       490          47          0.0             0.0         -2.302585   \n",
       "\n",
       "   benchmark    oswald  \n",
       "0   0.000000  0.000000  \n",
       "1   0.000000  0.442115  \n",
       "2   0.003466  0.003466  \n",
       "3   0.000000  0.000000  \n",
       "4   0.275300  0.039006  "
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "########## CALCULATE MSE AND TADDE FOR TASK 1\n",
    "# The location of the evaluation CSV can be overridden with the T1_CM_CSV\n",
    "# environment variable; the default is the original author's local path.\n",
    "T1_CM_CSV = os.environ.get(\n",
    "    'T1_CM_CSV',\n",
    "    r'/home/default/Dokumente/TRINITY/comp/OpenViEWS2-master/projects/model_development/Analysis_Review/t1_cm.csv',\n",
    ")\n",
    "dat = pd.read_csv(T1_CM_CSV)\n",
    "dat.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mse(actual, pred):\n",
    "    \"\"\"Return the mean of the squared element-wise differences actual - pred.\n",
    "\n",
    "    Both arguments are converted to NumPy arrays first, so pandas objects\n",
    "    are compared by position, not by index label.\n",
    "    \"\"\"\n",
    "    residuals = np.asarray(actual) - np.asarray(pred)\n",
    "    return np.mean(residuals ** 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1.0741900547433738, 0.7593389243487366, 0.9339635122612491, 1.3455800197228016, 1.504975745698837, 0.4985225807940971]\n"
     ]
    }
   ],
   "source": [
    "#### MSE WITH WIKI VIEWS\n",
    "\n",
    "# dat is cut into six consecutive blocks of 54 rows, bound to s2..s7.\n",
    "# NOTE(review): presumably one block per forecast step 2-7 -- verify\n",
    "# against the row order of the CSV that dat was read from.\n",
    "rows_per_block = 54\n",
    "blocks = [\n",
    "    dat.iloc[start:start + rows_per_block]\n",
    "    for start in range(0, 6 * rows_per_block, rows_per_block)\n",
    "]\n",
    "s2, s3, s4, s5, s6, s7 = blocks\n",
    "\n",
    "# MSE of the oswald column against d_ln_ged_best_sb, one value per block.\n",
    "mse_t1 = [mse(block.d_ln_ged_best_sb, block.oswald) for block in blocks]\n",
    "\n",
    "print(mse_t1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1.812883279689574, 2.017411060510773, 2.102652436065627, 2.0229100774581337, 2.439531714531185, 0.8670020903828731]\n"
     ]
    }
   ],
   "source": [
    "#### MSE BENCHMARK\n",
    "\n",
    "# dat is cut into six consecutive blocks of 54 rows, bound to s2..s7.\n",
    "# NOTE(review): presumably one block per forecast step 2-7 -- verify\n",
    "# against the row order of the CSV that dat was read from.\n",
    "rows_per_block = 54\n",
    "blocks = [\n",
    "    dat.iloc[start:start + rows_per_block]\n",
    "    for start in range(0, 6 * rows_per_block, rows_per_block)\n",
    "]\n",
    "s2, s3, s4, s5, s6, s7 = blocks\n",
    "\n",
    "# MSE of the benchmark column against d_ln_ged_best_sb, one value per block.\n",
    "# Note that this rebinds mse_t1.\n",
    "mse_t1 = [mse(block.d_ln_ged_best_sb, block.benchmark) for block in blocks]\n",
    "\n",
    "print(mse_t1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
